logo

Introduction

The purpose of this project is to analyze how professional basketball team schedules influence performance. Using data from recent seasons, it explores how schedule congestion, travel demands, and home/away balance affect teams over time. All analysis was completed using tidyverse, ggplot2, and plotly. The provided data was used as truth (for example, some “home” games have been played at secondary locations, including TOR’s entire 2020-21 season. These are not reflected in the data, so it is not accounted for) Note that the 2024-25 schedules in schedule_24_partial.csv intentionally include only 80 games, as the league holds 2 games out for each team in the middle of December due to unknown NBA Cup matchups.

Note:
Throughout this document, any season column represents the year each season started. For example, the 2015-16 season will be in the dataset as 2015. We may refer to a season by just this number (e.g. 2015) instead of the full text (e.g. 2015-16).

Setup and Data

library(tidyverse)
schedule <- read_csv("~/Downloads/schedule.csv")
draft_schedule <- read_csv("~/Downloads/schedule_24_partial.csv")
locations <- read_csv("~/Downloads/locations.csv")
game_data <- read_csv("~/Downloads/team_game_data.csv")

Part 1 – Schedule Congestion Analysis

#The first step identifies periods where your selected team plays four games in six nights. (Note: The stretches can overlap)
#OKC is team chosen, any team can be inserted. Apply to 2024-25 schedule.

draft_schedule_OKC <- draft_schedule %>%
  filter(team == "OKC")

count_4in6<- function(data, date_col){
  data %>%
    # Standarize the data 
    transmute(gamedate = as.Date({{date_col}})) %>% 
    arrange(gamedate) %>%
    mutate(n_games = purrr::map_int(gamedate, ~ sum(gamedate >= .x & gamedate <= .x+5))) %>%
    filter(n_games >= 4) %>%
  mutate(window_end = gamedate + 5) %>%
  select(window_start = gamedate, window_end, n_games)
}
res <- count_4in6(draft_schedule_OKC, gamedate)
res
## # A tibble: 26 × 3
##    window_start window_end n_games
##    <date>       <date>       <int>
##  1 2024-10-30   2024-11-04       4
##  2 2024-11-01   2024-11-06       4
##  3 2024-11-06   2024-11-11       4
##  4 2024-11-08   2024-11-13       4
##  5 2024-11-10   2024-11-15       4
##  6 2024-11-15   2024-11-20       4
##  7 2024-12-26   2024-12-31       4
##  8 2024-12-28   2025-01-02       4
##  9 2024-12-29   2025-01-03       4
## 10 2024-12-31   2025-01-05       4
## # ℹ 16 more rows
nrow(res)
## [1] 26

Part 2

#Apply to find historical four in six averages

by_team_season <- schedule %>%
  filter(season >= 2014, season <= 2023) %>%
  group_by(team,season) %>%
  summarise(games_played = n(),
            four_in_six = nrow(count_4in6(pick(gamedate), gamedate)), 
            .groups = "drop"
  ) %>%
  mutate(four_in_six_per82 = four_in_six * 82 / games_played)
overall_avg_q2 <- by_team_season %>% 
  summarise(avg_4in6_per82 = mean(four_in_six_per82, na.rm = TRUE))
overall_avg_q2
## # A tibble: 1 × 1
##   avg_4in6_per82
##            <dbl>
## 1           25.1

#Locate highest average number of 4-in-6 stretches between 2014-15 and 2023-24. Identify which team has had the highest and lowest average.

by_team_season <- schedule %>%
  filter(season >= 2014, season <= 2023) %>%
  group_by(team,season) %>%
  summarise(games_played = n(),
            four_in_six = nrow(count_4in6(pick(gamedate), gamedate)), 
            .groups = "drop") %>%
  mutate(four_in_six_per82 = four_in_six * 82 / games_played)

#Avg per team across seasons 
team_avgs <- by_team_season %>%
  group_by(team) %>%
  summarise(avg_4in6_per82 = mean(four_in_six_per82, na.rm = TRUE)) %>%
  arrange(desc(avg_4in6_per82))

#Identify highest and lowest 
highest <- team_avgs %>% slice(1)
lowest  <- team_avgs %>% slice(n()) 

highest 
## # A tibble: 1 × 2
##   team  avg_4in6_per82
##   <chr>          <dbl>
## 1 CHA             28.1
lowest
## # A tibble: 1 × 2
##   team  avg_4in6_per82
##   <chr>          <dbl>
## 1 NYK             22.2

The difference between the most CHA (28.1) and least NYK(22.2) is nearly 6 stretches per 82 games. This difference is likely the result of chance since it is fairly small compared to the amount of teams in the league and the 10 seasons considered.

Part 3 – Modeling

#Determine Most Hurt and Most Helped by schedule

sched_feat <- schedule %>%
  mutate(gamedate = as.Date(gamedate),
         away = (home == 0)) %>%
  filter(season >= 2019, season <= 2023) %>%
  arrange(team, gamedate) %>%
  group_by(team, season) %>%
  mutate(
    prev_date = lag(gamedate),
    next_date = lead(gamedate),
    
#B2B
    b2b_any = (!is.na(prev_date) & (gamedate - prev_date == 1)) |
              (!is.na(next_date) & (next_date - gamedate == 1)),

#Count 4in6
n_in_6 = purrr::map_int(gamedate, \(d) sum(gamedate >= d & gamedate <= d + 5)),
four_in_six = (n_in_6 >= 4),

#Road Trips 
prev_away = lag(away, default = FALSE),
new_trip  = away & !prev_away,
trip_id   = ifelse(away,cumsum(new_trip), NA_integer_)
) %>%
group_by(team, season, trip_id) %>%
mutate(road_trip_index = ifelse(away, row_number(), 0L)) %>%
ungroup() %>%
mutate(
  b2b             = as.integer(b2b_any),
  four_in_six     = as.integer(four_in_six),
  road_trip_index = replace_na(road_trip_index, 0L),
  home            = as.integer(home),
  win             = as.integer(win)
) %>%
select(season, team, gamedate, home, away, win, b2b, four_in_six, road_trip_index)

#Fit model 

sched_scored <- sched_feat %>%
  filter(season >= 2019, season <= 2023)

sched_model <- glm(
  win ~ b2b + four_in_six + road_trip_index + home,
  data = sched_scored,
    family = binomial()
) 
summary(sched_model)
## 
## Call:
## glm(formula = win ~ b2b + four_in_six + road_trip_index + home, 
##     family = binomial(), data = sched_scored)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -0.19060    0.05406  -3.526 0.000422 ***
## b2b             -0.14948    0.03956  -3.778 0.000158 ***
## four_in_six      0.03421    0.04083   0.838 0.402086    
## road_trip_index  0.01202    0.02052   0.586 0.558168    
## home             0.43869    0.05686   7.715 1.21e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16161  on 11657  degrees of freedom
## Residual deviance: 16017  on 11653  degrees of freedom
## AIC: 16027
## 
## Number of Fisher Scoring iterations: 3
sched_scored$pred_win_prob <- predict(sched_model, newdata = sched_scored, type = "response")

sched_scored <- sched_scored %>%
  mutate(expected_wins = ifelse(pred_win_prob > 0.50, 1, 0))

#Actual vs Expected 
  team_summary <- sched_scored %>%
    group_by(team) %>%
    summarise(
      total_actual_wins   = sum(win, na.rm = TRUE),
      total_expected_wins = sum(expected_wins, na.rm = TRUE),
      diff          = total_actual_wins - total_expected_wins,
      .groups       = "drop"
    ) %>%
  arrange(diff)

most_hurt   <- slice_head(team_summary, n = 1)
most_helped <- slice_tail(team_summary, n = 1)

most_helped
## # A tibble: 1 × 4
##   team  total_actual_wins total_expected_wins  diff
##   <chr>             <int>               <dbl> <dbl>
## 1 MIL                 260                 195    65
most_hurt
## # A tibble: 1 × 4
##   team  total_actual_wins total_expected_wins  diff
##   <chr>             <int>               <dbl> <dbl>
## 1 DET                  94                 191   -97
  • Most Helped by Schedule: MIL (260 wins)
  • Most Hurt by Schedule: DET (94 wins)

I fit a logistic regression model on games from 2019 to 2023 with wins as the outcome and schedule features as predictors. The estimates indicate that back to backs are influential in reducing win probability, while home court advantage increases a team’s chances to win. The four in six and road trip indicators are small so they are not significant to the outcome. Overall, the model points to venue and back to back density as the schedule factors with clear impact.